Diagnostics

November 13, 2024

Diagnostics

library(haven)
library(tidyverse)
library(margins)
library(jtools)
library(ggrepel)
library(lmtest)
library(car)
library(estimatr)

nes <- read_dta("nes.dta")
states <- read_dta("states.dta")

Multicollinearity

  • Estimate multiple regression model
a <- lm(ft_Trump_pre ~ partyid7 + libcon7 + better_worse_past_econ + Female + educ4 + 
          as.factor(Race3) + income5 + Age, data=nes)
summary(a)

Call:
lm(formula = ft_Trump_pre ~ partyid7 + libcon7 + better_worse_past_econ + 
    Female + educ4 + as.factor(Race3) + income5 + Age, data = nes)

Residuals:
    Min      1Q  Median      3Q     Max 
-93.063 -14.899  -0.285  15.105 116.722 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -15.74465    2.34932  -6.702 2.39e-11 ***
partyid7                 6.97994    0.26640  26.201  < 2e-16 ***
libcon7                  4.64589    0.37594  12.358  < 2e-16 ***
better_worse_past_econ   5.37718    0.42656  12.606  < 2e-16 ***
Female                  -3.40590    0.81809  -4.163 3.21e-05 ***
educ4                   -3.36747    0.44403  -7.584 4.27e-14 ***
as.factor(Race3)2       -9.46221    1.45316  -6.511 8.50e-11 ***
as.factor(Race3)3      -10.59720    1.33794  -7.921 3.15e-15 ***
income5                 -0.61813    0.33310  -1.856  0.06359 .  
Age                      0.06952    0.02354   2.953  0.00317 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 23.85 on 3514 degrees of freedom
  (747 observations deleted due to missingness)
Multiple R-squared:  0.5393,    Adjusted R-squared:  0.5381 
F-statistic: 457.1 on 9 and 3514 DF,  p-value: < 2.2e-16

Multicollinearity

  • Correlation matrix
nes |> 
  drop_na(partyid7, libcon7, better_worse_past_econ) |> 
  select(partyid7, libcon7, better_worse_past_econ) |> 
  cor()
                        partyid7   libcon7 better_worse_past_econ
partyid7               1.0000000 0.6508763              0.4087402
libcon7                0.6508763 1.0000000              0.3893138
better_worse_past_econ 0.4087402 0.3893138              1.0000000

Multicollinearity

  • Variance inflation factor
## Variance inflation factor from car package
vif(a)
                           GVIF Df GVIF^(1/(2*Df))
partyid7               2.098128  1        1.448492
libcon7                1.935877  1        1.391358
better_worse_past_econ 1.291915  1        1.136624
Female                 1.033020  1        1.016376
educ4                  1.255323  1        1.120412
as.factor(Race3)       1.249240  2        1.057211
income5                1.261215  1        1.123038
Age                    1.062319  1        1.030689

Heteroscedasticity

  • Regression model
b <- lm(vep16_turnout ~ south + ba_or_more_2015 + hispanicpct_2016 + blackpct_2016, data=states)
summary(b)

Call:
lm(formula = vep16_turnout ~ south + ba_or_more_2015 + hispanicpct_2016 + 
    blackpct_2016, data = states)

Residuals:
    Min      1Q  Median      3Q     Max 
-20.340  -2.163  -0.181   2.505   9.277 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      44.72964    4.92568   9.081 9.74e-12 ***
south            -3.33120    2.39843  -1.389 0.171696    
ba_or_more_2015   0.65107    0.15814   4.117 0.000162 ***
hispanicpct_2016 -0.19162    0.07175  -2.671 0.010494 *  
blackpct_2016     0.07586    0.11132   0.682 0.499043    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.082 on 45 degrees of freedom
Multiple R-squared:  0.4169,    Adjusted R-squared:  0.3651 
F-statistic: 8.044 on 4 and 45 DF,  p-value: 5.559e-05

Heteroscedasticity

## Save residuals and y-hats to states data
states <- states |> mutate(res=residuals(b))
states <- states |> mutate(yhat=fitted(b))

Graph

#Graph residuals against y-hats
states |> 
  ggplot(aes(x=yhat, y=res)) + 
  geom_point() +
  geom_hline(yintercept = 0)

Graph

# Graph residuals against x variables, look for patterns
states |> 
  ggplot(aes(x=ba_or_more_2015, y=res)) + 
  geom_point() + 
  geom_hline(yintercept = 0)

Graph

states |> 
  ggplot(aes(x=hispanicpct_2016, y=res)) + 
  geom_point() + 
  geom_hline(yintercept = 0)

Graph

states |> 
  ggplot(aes(x=blackpct_2016, y=res)) + 
  geom_point() + 
  geom_hline(yintercept = 0)

Graph

states |> 
  ggplot(aes(x=south, y=res)) + 
  geom_point() + 
  geom_hline(yintercept = 0)

Breusch-Pagan Test for Heteroskedasticity

bptest(b, studentize = FALSE)

    Breusch-Pagan test

data:  b
BP = 5.9558, df = 4, p-value = 0.2025

Breusch-Pagan Covariate Specific Tests

bptest(b, ~ba_or_more_2015, data=states, studentize = FALSE)

    Breusch-Pagan test

data:  b
BP = 0.73547, df = 1, p-value = 0.3911

Breusch-Pagan Covariate Specific Tests

bptest(b, ~hispanicpct_2016, data=states, studentize = FALSE)

    Breusch-Pagan test

data:  b
BP = 0.25711, df = 1, p-value = 0.6121

Breusch-Pagan Covariate Specific Tests

bptest(b, ~blackpct_2016, data=states, studentize = FALSE)

    Breusch-Pagan test

data:  b
BP = 4.4938, df = 1, p-value = 0.03402

Breusch-Pagan Covariate Specific Tests

bptest(b, ~south, data=states, studentize = FALSE)

    Breusch-Pagan test

data:  b
BP = 1.4879, df = 1, p-value = 0.2225

Robust Standard Errors

c <- lm_robust(vep16_turnout ~ south + ba_or_more_2015 + hispanicpct_2016 + 
                 blackpct_2016, se_type="stata", data=states)
summary(c)

Call:
lm_robust(formula = vep16_turnout ~ south + ba_or_more_2015 + 
    hispanicpct_2016 + blackpct_2016, data = states, se_type = "stata")

Standard error type:  HC1 

Coefficients:
                 Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)      44.72964    3.12417 14.3173 2.282e-18 38.43725 51.02204 45
south            -3.33120    1.89671 -1.7563 8.584e-02 -7.15137  0.48897 45
ba_or_more_2015   0.65107    0.09834  6.6207 3.728e-08  0.45301  0.84913 45
hispanicpct_2016 -0.19162    0.05198 -3.6865 6.093e-04 -0.29631 -0.08693 45
blackpct_2016     0.07586    0.08711  0.8709 3.884e-01 -0.09958  0.25131 45

Multiple R-squared:  0.4169 ,   Adjusted R-squared:  0.3651 
F-statistic: 17.11 on 4 and 45 DF,  p-value: 1.34e-08

Unusual Observations (Outliers and Influence)

## Studentized residuals for outliers
## Save st. res. to states data
states <- states |> mutate(sres=rstudent(b))

# Generate observation counter (1 to 50) 
states <- states |> mutate(obs=1:nrow(states))

Graph for Outliers

states |>
  ggplot(aes(x=obs, y=sres, label=StateID)) + 
  geom_point() + 
  geom_text_repel() +
  geom_hline(yintercept=2, linetype="dashed") + 
  geom_hline(yintercept=-2, linetype="dashed")

Influence Statistics, DFBETAS

states <- states |> mutate(dfb=dfbetas(b))

Graph Influence Statistics, DFBETAS

states |>
  ggplot(aes(x=obs, y=dfb[,"ba_or_more_2015"], label=StateID)) + 
  geom_point() + 
  geom_text_repel() + 
  geom_hline(yintercept=2/sqrt(50)) + 
  geom_hline(yintercept=-2/sqrt(50))

Graph Influence Statistics, DFBETAS

states |>
  ggplot(aes(x=obs, y=dfb[,"south"], label=StateID)) + 
  geom_point() + 
  geom_text_repel() + 
  geom_hline(yintercept=2/sqrt(50)) + 
  geom_hline(yintercept=-2/sqrt(50))

Graph Influence Statistics, DFBETAS

states |>
  ggplot(aes(x=obs, y=dfb[,"hispanicpct_2016"], label=StateID)) + 
  geom_point() + 
  geom_text_repel() + 
  geom_hline(yintercept=2/sqrt(50)) + 
  geom_hline(yintercept=-2/sqrt(50))

Graph Influence Statistics, DFBETAS

states |>
  ggplot(aes(x=obs, y=dfb[,"blackpct_2016"], label=StateID)) + 
  geom_point() + 
  geom_text_repel() + 
  geom_hline(yintercept=2/sqrt(50)) + 
  geom_hline(yintercept=-2/sqrt(50))

Normality of Errors

#Density plot of residuals
states |> 
  ggplot(aes(x=res)) + 
  geom_density()

Normality of Errors

# Jarque Bera test for normality; install tsoutliers package
library(tsoutliers)
JarqueBera.test(residuals(b))

    Jarque Bera Test

data:  residuals(b)
X-squared = 55.123, df = 2, p-value = 1.072e-12


    Skewness

data:  residuals(b)
statistic = 1.138, p-value = 0.001019


    Kurtosis

data:  residuals(b)
statistic = 7.6129, p-value = 2.773e-11